#!/bin/bash
# Copyright (c) Los Alamos National Security, LLC, and others.

# Compute keyword time series and related analyses.

# Load the shared test environment. The rest of this script relies on it for
# QUACBASE, DATADIR, and TWEETDIR, and presumably also for the x and y helpers
# used below (NOTE(review): not visible from this file -- confirm).
. ./environment.sh

# Refuse to continue without a data directory: with DATADIR empty, a bare
# "cd" would silently land in $HOME, and the --clean job directories below
# would then be created (and cleaned) in the wrong place.
: "${DATADIR:?DATADIR must be set}"

# Stage the spreadsheet that ngrams-correlate reads, then work from inside
# the data directory so the relative paths used below resolve there.
cp -- "$QUACBASE/misc/halloween.xls" "$DATADIR"
cd -- "$DATADIR" || exit 1

# From here on, fold stderr into stdout so both streams are captured together.
exec 2>&1


## ngrams-build ##

# Build the time series. Several partitions are used so that hash-based time
# series lookup gets exercised. The tool's own stdout is discarded; only the
# abbreviated command line echoed here appears in the output.
echo "$ ngrams-build ..."
ngrams-build \
    --notimes \
    --partitions 10 \
    --min-occur 10 \
    --run 1 \
    --clean \
    --jobdir series \
    "$TWEETDIR/pre" > /dev/null

# Show what the job produced, then dump the totals pickle. The echoed command
# is deliberately shortened relative to the real path.
x ls -R series
echo "$ dump-pickle total.pkl.gz ..."
dump-pickle series/out/total.pkl.gz


## ngrams-search ##

# Query two terms with well-understood shapes, first in summary form and then
# with --detail:
#
#   candy  should peak near Halloween. (Note that midnight UTC is 8pm EST, so
#          a lot of trick-or-treating related traffic lands on November 1 UTC.)
#   sandy  is the hurricane.
for term in candy sandy; do
    y "ngrams-search --query 't@ $term' series/out"
    y "ngrams-search --detail --query 't@ $term' series/out"
done

# "a" is included because it is an n-gram that once had trouble being sorted
# (see commit 885d3d).
y "ngrams-search --query 't@ a' series/out"

# Zero-fill check: "taco" has no data on October 28 (beginning), November 1
# (middle), and November 4/5 (end), so zeros must be inserted at each spot.
y "ngrams-search --detail --query 't@ taco' series/out"


## ngrams-correlate ##

# Correlate the series built above against halloween.xls. The Twitter sample
# rate passed below is the product of four factors:
#
#     0.01   spritzer 1%
#   * 0.05   5% sample for testing
#   * 0.015  1.5% of tweets are geotagged
#   * 0.5    last day of test sample is incomplete
#   -------
#   = 3.75e-6
#
# As with the build step, the tool's stdout is discarded and only an
# abbreviated command line is echoed.
echo "$ ngrams-correlate ..."
ngrams-correlate \
    --notimes \
    --min-similarity 0.7 \
    --min-ppm 2000 \
    --tw-sample-rate 3.75e-6 \
    --run 1 \
    --clean \
    --jobdir corr \
    series/out halloween.xls > /dev/null

# Inspect the results: the job directory listing, then the first ten lines of
# two of the output files.
x ls -R corr
x head -10 corr/out/halloween:halloween.tsv
x head -10 corr/out/halloween:s%40ndy.tsv
